import transformers
import torch


def llama31_example(llama31_model_id="meta-llama/Meta-Llama-3.1-8B-Instruct"):
    """
    Example of using the LLaMA 3.1 model with vLLM for inference.
    This example demonstrates how to set up a text generation pipeline
    using the Hugging Face Transformers library and vLLM backend.

    Make sure to install the required libraries:
    pip install transformers vllm

    Note: This example assumes you have access to the Meta-Llama-3.1-8B-Instruct model.
    You may need to accept the model license on Hugging Face Hub.
    """
    # Build a text-generation pipeline; this loads both the model and its
    # tokenizer. device_map="auto" places the weights on available GPU(s),
    # falling back to CPU.
    pipeline = transformers.pipeline(
        "text-generation",
        model=llama31_model_id,
        model_kwargs={"torch_dtype": torch.bfloat16},
        device_map="auto",
    )

    # Chat messages in role/content format; the pipeline applies the
    # model's chat template automatically.
    messages = [
        {
            "role": "system",
            "content": "You are a pirate chatbot who always responds in pirate speak!",
        },
        {"role": "user", "content": "Who are you?"},
    ]

    outputs = pipeline(
        messages,
        max_new_tokens=256,
    )
    # generated_text holds the full conversation (the input messages plus
    # the generated reply); [-1] is the assistant's new message.
    print(outputs[0]["generated_text"][-1])

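
if __name__ == "__main__":
    # Run the Transformers pipeline example by default.
    llama31_example()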